Visualize Cell Health Predictions on Repurposing Hub Data

Gregory Way, 2019

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))

source(file.path("scripts", "visualize_utils.R"))

Set Constants

In [2]:
consensus <- "modz"

output_dir <- file.path("figures", "umap", consensus)
dir.create(output_dir)
Warning message in dir.create(output_dir):
“'figures/umap/modz' already exists”

Load and Process Data

In [3]:
# Load Data
data_dir <- file.path("repurposing_cellhealth_shiny", "data")
real_file <- file.path(
    data_dir,
    paste0("moa_cell_health_", consensus, ".tsv.gz")
)

cp_embedding_df <- readr::read_tsv(real_file, col_types = readr::cols())

cp_embedding_df <- cp_embedding_df %>%
    dplyr::mutate(Metadata_Treatment = "Compound")

cp_embedding_df$Metadata_Treatment[cp_embedding_df$Metadata_broad_core_id == "DMSO"] = "DMSO"

print(dim(cp_embedding_df))
head(cp_embedding_df, 3)
[1] 10368    88
A tibble: 3 × 88
Metadata_Plate_Map_NameMetadata_broad_core_idMetadata_broad_sampleMetadata_dose_recodeMetadata_mmoles_per_literMetadata_pert_wellumap_xumap_ybroad_idpert_inamecell_health_modz_target_cc_g2_high_h2axcell_health_modz_target_cc_late_mitosis_n_spots_h2ax_meancell_health_modz_target_cc_cc_high_h2axcell_health_modz_target_vb_percent_dead_onlycell_health_modz_target_cc_s_high_h2axcell_health_modz_target_cc_cc_n_spots_h2ax_per_nucleus_area_meancell_health_modz_target_cc_s_n_spots_h2ax_per_nucleus_area_meancell_health_modz_target_cc_g1_plus_g2_countcell_health_modz_target_vb_live_cell_width_lengthMetadata_Treatment
<chr><chr><chr><int><dbl><chr><dbl><dbl><chr><chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr>
C-7161-01-LM6-001BRD-A25234499BRD-A25234499-001-18-3610.0000B13-1.580029-0.9219316BRD-A25234499aminoglutethimide0.27519570.23198430.034737310.22677670.2373750.105484740.1536775-0.5309856 0.14445485Compound
C-7161-01-LM6-001BRD-A25234499BRD-A25234499-001-18-35 3.3333B14-1.904766-2.4360995BRD-A25234499aminoglutethimide0.51695020.24215890.012577280.63289030.3998580.022915390.2027969-0.1597692-0.29454513Compound
C-7161-01-LM6-001BRD-A25234499BRD-A25234499-001-18-34 1.1111B15-1.899164-2.5914881BRD-A25234499aminoglutethimide0.40124960.26825450.080886200.20743220.3372090.180616970.2859738-0.2541855-0.04831094Compound

Visualize Metadata and Cell Health Variables

In [4]:
ggplot(cp_embedding_df,
       aes(x = umap_x, y = umap_y)) +
    geom_point(aes(color = Metadata_dose_recode,
                   size = paste(Metadata_Treatment)),
               pch = 16,
               alpha = 0.6) +
    theme_bw() +
    scale_color_viridis_c(name = "Dose\nRecoded") +
    scale_size_discrete("Treatment",
                        range = c(0.5, 2)) +
    xlab("UMAP 1") +
    ylab("UMAP 2")

output_file <- file.path(
    output_dir,
    paste0("umap_repurposing_cell_painting_dose_consensus_", consensus, ".png")
)
ggsave(output_file, height = 5, width = 6, dpi = 500)
Warning message:
“Using size for a discrete variable is not advised.”
In [5]:
ggplot(cp_embedding_df,
       aes(x = umap_x, y = umap_y)) +
    geom_point(aes(color = Metadata_dose_recode,
                   size = paste(Metadata_Treatment)),
               pch = 16,
               alpha = 0.6) +
    theme_bw() +
    scale_color_viridis_c(name = "Dose\nRecoded") +
    scale_size_discrete("Treatment",
                        range = c(0.5, 2)) +
    xlab("UMAP 1") +
    ylab("UMAP 2")

output_file <- file.path(
    output_dir,
    paste0("umap_repurposing_cell_painting_dose_consensus_", consensus, ".png")
)
ggsave(output_file, height = 5, width = 6, dpi = 500)
Warning message:
“Using size for a discrete variable is not advised.”
In [6]:
ggplot(cp_embedding_df %>% dplyr::filter(Metadata_Treatment == "DMSO"),
       aes(x = umap_x, y = umap_y)) +
    geom_point(aes(color = Metadata_pert_well),
               pch = 16,
               size = 2,
               alpha = 0.6) +
    geom_point(data = cp_embedding_df,
               color = "grey",
               alpha = 0.1,
               size = 0.5) +
    theme_bw() +
    theme(legend.position = "none") +
    xlab("UMAP 1") +
    ylab("UMAP 2")

output_file <- file.path(
    output_dir,
    paste0("umap_repurposing_cell_painting_dose_consensus_dmso_", consensus, ".png")
)

ggsave(output_file, height = 5, width = 6, dpi = 500)

Certain Models had Better Performance in A549

We applied all models to the Drug Repurposing Set data.

Here, output visualizations of several of the top models.

Performance Rank

In [7]:
# Load feature mapping
mapping_dir <- file.path("..", "1.generate-profiles", "data", "labels")
mapping_file <- file.path(mapping_dir, "feature_mapping_annotated.csv")
map_df <- readr::read_csv(
    mapping_file,
    col_types = readr::cols(.default = readr::col_character())
)

print(dim(map_df))
tail(map_df, 3)
[1] 75 16
A tibble: 3 × 16
idreadable_nameoriginal_namefeature_typemeasurementgate_requiredassayhoechsteduph3gh2axcaspasedraq7cell_roxdpcdescription
<chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr>
cc_g2_g1_count Cell Count - G2/G1 G2/G1 cell_cyclecell_cycle_count1hoechst_edu_ph311100000G2/G1
cc_g1_plus_g2_count Cell Count - G1+G2 G1+G2 cell_cyclecell_cycle_count1hoechst_edu_ph311100000G1+G2
cc_g2_plus_all_m_countCell Count - G2 + MG2 + All M-phasecell_cyclecell_cycle_count1hoechst_edu_ph311100000G2+M

Visualizing Specific Cell Health Models

Only a few are shown here, but all are saved in a separate folder.

Predicting Number of Live Cells

In [8]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_vb_num_live_cells",
    legend_title = "Num Live Cells",
    output_dir = "none",
    save = FALSE,
    print_figure = FALSE
)

Predicting Live Width:Length

In [9]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_vb_live_cell_width_length",
    legend_title = "Live Cell\n(Width:Length)",
    output_dir = "none",
    save = FALSE,
    print_figure = FALSE
)

Predicting Live Cell Roundness

In [10]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_vb_live_cell_roundness",
    legend_title = "Live Cell Roundness",
    output_dir = "none",
    save = FALSE,
    print_figure = FALSE
)

Predicting Number of Objects

In [11]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_cc_all_n_objects",
    legend_title = "Number of Objects",
    output_dir = "none",
    save = FALSE,
    print_figure = FALSE
)

Predicting Live Cell Area

In [12]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_vb_live_cell_area",
    legend_title = "Live Cell Area",
    output_dir = "none",
    save = FALSE,
    print_figure = FALSE
)

Predicting Number of Objects in Cell Cycle Stage

In [13]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_cc_cc_n_objects",
    legend_title = "Num Cell\nCycle Objects",
    output_dir = "none",
    save = FALSE,
    print_figure = FALSE
)

Predicting Number of Objects in G1

In [14]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_cc_g1_n_objects",
    legend_title = "G1 Objects",
    output_dir = "none",
    save = FALSE
)

Predicting EdU positive intensity

In [15]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_cc_s_intensity_nucleus_area_mean",
    legend_title = "Sum S phase",
    output_dir = "none",
    save = FALSE
)

Predicting ROS

In [16]:
visualize_umap(
    cp_embedding_df,
    target_variable = "cell_health_modz_target_vb_ros_back_mean",
    legend_title = "ROS Background",
    output_dir = "none",
    save = FALSE
)

Visualize All Cell Health Variables

In [17]:
cell_health_variables <- colnames(
    cp_embedding_df %>%
        dplyr::select(starts_with("cell_health_modz_target_"))
    )

length(cell_health_variables)
70
In [18]:
pdf_file <- file.path(
    output_dir,
    paste0("repurposing_hub_umaps_consensus_", consensus, ".pdf")
)
pdf(pdf_file, width = 5, height = 5, onefile = TRUE)

for (cell_health_variable in cell_health_variables) {
    umap_gg <- visualize_umap(
        df = cp_embedding_df,
        target_variable = cell_health_variable,
        legend_title = "Prediction:",
        title = cell_health_variable,
        dpi = 200,
        save_figure = FALSE
    )
}

dev.off()
pdf: 2

Visualize Model Scores on DMSO samples

In [19]:
assay_theme_file <- file.path("..", "3.train", "scripts", "assay_themes.R")
source(assay_theme_file)
In [20]:
col_types <- readr::cols(
    .default = readr::col_character(),
    shuffle_false = readr::col_double(),
    shuffle_true = readr::col_double()
)

rank_file <- file.path(
    "repurposing_cellhealth_shiny",
    "data",
    paste0("A549_ranked_models_regression_", consensus, ".tsv")
)
model_rank_df <- readr::read_tsv(rank_file, col_types = col_types)

# Recode the target variable
model_rank_df$target <- paste0("cell_health_", consensus, "_target_", model_rank_df$target)

head(model_rank_df, 3)
A tibble: 3 × 10
metrictargetoriginal_namereadable_namefeature_typemeasurementassaydescriptionshuffle_falseshuffle_true
<chr><chr><chr><chr><chr><chr><chr><chr><dbl><dbl>
r_twocell_health_modz_target_vb_live_cell_width_lengthLive Width:Length Live Width / Lengthviabilityshapedpc Width/Length 0.8916483 6.635413e-03
r_twocell_health_modz_target_vb_num_live_cells # Live Cells # Live Cells viabilitydeathdraqNumber of live cells0.8811216-5.934792e-06
r_twocell_health_modz_target_vb_live_cell_roundness Live Cell RoundnessLive Cell Roundnessviabilityshapedpc Cell Roundness 0.8552069 2.012126e-01
In [21]:
dmso_embeddings_df <- cp_embedding_df %>%
    dplyr::filter(Metadata_Treatment == "DMSO")

non_dmso_embeddings_df <- cp_embedding_df %>%
    dplyr::filter(Metadata_Treatment != "DMSO")
In [22]:
std_dev_dmso_features <- apply(
    dmso_embeddings_df %>% 
        dplyr::select(matches("cell_health_modz_target")),
    2,
    sd
)
std_dev_compound_features <- apply(
    non_dmso_embeddings_df %>%
        dplyr::select(matches("cell_health_modz_target")),
    2, 
    sd
)
In [23]:
std_dev_all_df <- dplyr::bind_cols(
    as.data.frame(std_dev_dmso_features),
    as.data.frame(std_dev_compound_features)
) %>%
    dplyr::mutate(
        features = colnames(dmso_embeddings_df %>%
                                dplyr::select(matches("cell_health_modz_target")))
    ) %>%
    dplyr::left_join(model_rank_df, by = c("features" = "target")) 

good_performing <- std_dev_all_df %>%
    dplyr::filter(shuffle_false > 0)

bad_performing <- std_dev_all_df %>%
    dplyr::filter(shuffle_false <= 0)

std_dev_good_df <- good_performing %>%
    dplyr::mutate(performance_scaled = (
        good_performing$shuffle_false - min(good_performing$shuffle_false)
    ) / (
        max(good_performing$shuffle_false) - min(good_performing$shuffle_false)
    )
                  )

print(dim(std_dev_good_df))
head(std_dev_good_df, 2)
[1] 37 13
A data.frame: 2 × 13
std_dev_dmso_featuresstd_dev_compound_featuresfeaturesmetricoriginal_namereadable_namefeature_typemeasurementassaydescriptionshuffle_falseshuffle_trueperformance_scaled
<dbl><dbl><chr><chr><chr><chr><chr><chr><chr><chr><dbl><dbl><dbl>
10.059678780.2168907cell_health_modz_target_cc_late_mitosis_n_spots_h2ax_per_nucleus_area_meanr_twopH3 neg, Hoechst cond (late mitosis) - Number of Spots per Area of Nucleus - ...Late M - # of gH2AX Spots per Area of Nucleuscell_cycledna_damagehoechst_edu_ph3_gh2axIn late M cells: average number of gH2Ax spots per nucleus area0.01027311-0.080457870.001896058
20.350653371.5849058cell_health_modz_target_cc_g1_n_objects r_twoG1 - Number of Objects G1 - # cells cell_cycleg1_phase hoechst_edu_ph3 Number of G1 cells 0.77424668 0.061784930.867049811
In [24]:
ggplot(std_dev_good_df,
       aes(x = std_dev_dmso_features, y = std_dev_compound_features)) +
    geom_point(aes(color = assay, size = performance_scaled),
               alpha = 0.8) +
    geom_point(data = bad_performing,
               aes(color = assay),
               size = 1,
               alpha = 0.5) +
    theme_bw() +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
    scale_color_manual(name = "Assay",
                       values = dye_colors,
                       labels = dye_labels) +
    scale_size_continuous(name = "Performance Scaled") + 
    xlab("Standard Deviation across DMSO Well Consensus") +
    ylab("Standard Deviation across all Consensus Compound Treatments") +
    theme(axis.text = element_text(size = 8),
          axis.title = element_text(size = 10),
          strip.text = element_text(size = 6),
          legend.text = element_text(size = 6),
          legend.title = element_text(size = 8),
         legend.key.size = unit(0.4, "cm"))

output_file <- file.path(output_dir, "dmso_vs_compound_standard_deviation.png")
ggsave(output_file, height = 5, width = 6, dpi = 500)